# Load libraries
import pandas as pd
import os
import re
import sys
from ftfy import fix_text # for text cleaning
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# Punctuation stripped from names before n-gram extraction.
_PUNCT_RE = re.compile(r"[().|\[\]{}'\-]")
# Runs of five or more digits (e.g. registration numbers) carry no name signal.
_LONG_DIGITS_RE = re.compile(r'[0-9]{5,}')
_MULTI_SPACE_RE = re.compile(' +')
_MATCH_COLUMNS = ['name_to_match', 'match_1', 'match_1_confidence',
                  'match_2', 'match_2_confidence']


def _name_ngrams(string, n=3):
    """Clean a company name and split it into overlapping character n-grams.

    Cleaning steps, in order: lower-case, repair mojibake with ``fix_text``,
    drop non-ASCII characters, discard everything from the first ``(``
    onwards, strip punctuation, spell out ``&`` as ``and``, remove runs of
    five or more digits, title-case, and collapse repeated spaces. The result
    is padded with one space on each side so word boundaries at the start and
    end of the name also produce n-grams.

    Args:
        string: The name to tokenise; converted with ``str()`` first.
        n: Length of each n-gram.

    Returns:
        A list of n-character substrings; empty when the padded name is
        shorter than ``n``.
    """
    string = fix_text(str(string).lower())
    string = string.encode("ascii", errors="ignore").decode()  # remove non ascii chars
    string = string.split('(', 1)[0]  # keep only the text before the first bracket
    string = _PUNCT_RE.sub('', string)
    string = string.replace('&', ' and ')
    # Must be a regex substitution: str.replace would look for the literal
    # text "[0-9]{5,}" and never remove anything.
    string = _LONG_DIGITS_RE.sub('', string)
    string = string.title()  # normalise case - capital at start of each word
    string = _MULTI_SPACE_RE.sub(' ', string).strip()
    string = ' ' + string + ' '  # pad names for ngrams
    return [string[i:i + n] for i in range(len(string) - n + 1)]


def name_match(names_a, names_b):
    """Find the two closest names in ``names_b`` for each name in ``names_a``.

    Names are vectorised as Tf-Idf weighted character trigrams fitted on
    ``names_b``; each distinct name in ``names_a`` is then looked up with a
    nearest-neighbour search.

    Args:
        names_a: Iterable of names to be matched. Duplicates are collapsed,
            keeping first-seen order.
        names_b: Iterable of candidate names to match against. Any iterable
            is accepted (list, pandas Series with any index, ...).

    Returns:
        A ``pandas.DataFrame`` with columns ``name_to_match``, ``match_1``,
        ``match_1_confidence``, ``match_2``, ``match_2_confidence`` and
        ``difference`` (match 1 confidence minus match 2 confidence), sorted
        by ``difference`` in descending order. Each confidence is
        ``1 - round(distance, 2)`` where ``distance`` is the value reported
        by ``NearestNeighbors.kneighbors``. NOTE(review): with scikit-learn's
        default settings this looks like a Euclidean distance between
        L2-normalised vectors, so poor matches may yield a negative
        confidence; confirm before treating it as a probability.
        When ``names_b`` holds a single name, ``match_2`` is ``None`` and the
        dependent columns are NaN. When ``names_a`` is empty, an empty frame
        with the same columns is returned.

    Raises:
        ValueError: If ``names_b`` is empty.
    """
    from sklearn.neighbors import NearestNeighbors

    # Materialise both inputs once: candidates are looked up by position
    # below (which must not depend on a Series index), and queries need a
    # stable order that lines up with the rows returned by kneighbors.
    candidates = list(names_b)
    if not candidates:
        raise ValueError("names_b must contain at least one name to match against")

    queries = list(dict.fromkeys(names_a))
    if not queries:
        return pd.DataFrame(columns=_MATCH_COLUMNS + ['difference'])

    # Vectorize names to be matched using Tf-Idf weighting
    vectorizer = TfidfVectorizer(min_df=1, analyzer=_name_ngrams, lowercase=False)
    tfidf = vectorizer.fit_transform(candidates)

    # Cannot ask for more neighbours than there are candidates.
    n_found = min(2, len(candidates))
    nbrs = NearestNeighbors(n_neighbors=n_found, n_jobs=-1).fit(tfidf)
    distances, indices = nbrs.kneighbors(vectorizer.transform(queries))

    rows = []
    for query, dist_row, idx_row in zip(queries, distances, indices):
        row = [query]
        for rank in range(2):
            if rank < n_found:
                row += [candidates[idx_row[rank]], 1 - round(dist_row[rank], 2)]
            else:
                # Only one candidate exists, so there is no runner-up.
                row += [None, float('nan')]
        rows.append(row)

    matches = pd.DataFrame(rows, columns=_MATCH_COLUMNS)

    # A large gap between the best and second-best match suggests an
    # unambiguous match, so surface those rows first.
    matches['difference'] = matches['match_1_confidence'] - matches['match_2_confidence']
    matches = matches.sort_values('difference', ascending=False)

    return matches